********************************************************************************
** Code for "Closing the publishing gender gap in economics and political science: Does a critical mass matter?"
* Author: Gabriela Galassi (ggalassi@bank-banque-canada.ca)
* Date: March 31, 2025
********************************************************************************
* Paths
* Root folder of the replication package; edit this line to match the local machine.
* The script reads from the "data" subfolder and writes to "temp" and "output".
global path "C:\Users\galg\OneDrive - Bank of Canada\MyDocs\Projects\gender gaps\plos one\gender_gaps_ecopoli"

* Make sure the folders this script writes to exist; -save- and -graph export-
* fail if the target folder is missing. -capture- ignores the error raised
* when the folder is already there.
capture mkdir "$path\temp"
capture mkdir "$path\output"

********************************************************************************
* Open the data
* Scholar-level CSV; replaces whatever dataset is currently in memory.
import delimited using "$path\data\data_byuni.csv", clear

* Variable labels
* Identification and position of the scholar
label variable name                "Name of scholar"
label variable uni                 "University"
label variable country             "Country of the University"
label variable discipline          "Discipline"
label variable pos_class           "Position"
label variable distinguished_dummy "Distinguished professor"
label variable gender              "Female"
* Bibliometric measures (the all-time and the 6-year versions carry the same label text)
label variable i10_index_all       "i10-index"
label variable i10_index_6y        "i10-index"
label variable citations_all       "Citations"
label variable citations_6y        "Citations"
label variable citations_1         "Citations most cited paper"
label variable averagecitations    "Citations per paper"
label variable averagecitations6y  "Citations per paper"

* Numeric (value-labelled) copy of the string variable country
encode country, generate(country_1)
label variable country_1 "country_1"

** DATA CORRECTIONS
* 1- Anja Fase was scraped as a different scholar
* The seven bibliometric measures for this scholar are overwritten with
* hard-coded values; the two "per paper" averages are set to citations / i10-index.
* NOTE(review): the name literal below looks like mis-encoded text (presumably
* "Anja Fasse" written with a sharp s) and presumably matches the bytes that
* -import delimited- produces for this CSV. Do not normalise the string without
* checking that the condition still matches exactly one observation.
replace i10_index_all = 24 if name == "Anja FaÃÂe"
replace citations_all = 1473 if name == "Anja FaÃÂe"
replace citations_1 = 379 if name == "Anja FaÃÂe"
replace averagecitations = 1473/24 if name == "Anja FaÃÂe"
replace i10_index_6y = 21 if name == "Anja FaÃÂe" 
replace citations_6y = 1419 if name == "Anja FaÃÂe"
replace averagecitations6y = 1419 / 21 if name == "Anja FaÃÂe"
* 2- These names are not correct
* Flag the affected scholars once, then set every bibliometric measure to
* missing for them. The flag is a temporary variable and is dropped right away.
tempvar wrong_name
generate byte `wrong_name' = name == "Stephanie Jung" ///
	| (inlist(name, "Xiaohong XU", "Xiaohui Kong", "Jing HAN") & uni == "Zhejiang University") ///
	| (name == "Li Jing" & uni == "Fudan University" & discipline == "poli")
foreach measure of varlist i10_index_all citations_all citations_1 averagecitations ///
	i10_index_6y citations_6y averagecitations6y {
	replace `measure' = . if `wrong_name'
}
drop `wrong_name'

** DESCRIPTIONS OF RAW DATA
* female representation per science
* (sample: scholars with non-missing total citations and a non-empty position)
tab gender discipline if citations_all !=. & pos_class!= "", col

* ancillary variables for descriptives of raw data
* scholars   : 1 for scholars in the sample above, missing otherwise
* scholars_g : number of sample scholars in the university x gender x discipline cell
* scholars_t : number of sample scholars in the university x discipline cell (department)
* n          : running index within university x gender x discipline
* n_dept     : running index within university x discipline
gen scholars = 1 if citations_all !=. & pos_class!= ""
bysort uni gender discipline: egen scholars_g = total(scholars)
bysort uni discipline: egen scholars_t = total(scholars)
bysort uni gender discipline: gen n = _n
bysort uni discipline: gen n_dept = _n

** TABLE A2	
* department-level data: distribution of men and women
* One row per university x gender x discipline cell for the by-gender counts.
bysort discipline: tabstat scholars_g if n==1, by(gender) s(mean sd min max)
* One row per department for the totals. Filtering on n==1 here would count a
* department once for every gender cell it contains, which distorts mean and sd.
bysort discipline: tabstat scholars_t if n_dept==1, s(mean sd min max)

drop scholars* n n_dept

** TABLE 1
* descriptives: gender gaps
* Mean and number of observations of each measure by gender, separately for
* each discipline; first the all-time measures, then the past-6-years ones.
local measures_all i10_index_all citations_all citations_1 averagecitations
local measures_6y  i10_index_6y citations_6y averagecitations6y
foreach set in measures_all measures_6y {
	by discipline, sort: tabstat ``set'' if !missing(gender) & pos_class != "", ///
		by(gender) statistics(mean N)
}

** DEPARTMENT-LEVEL DATA CONSTRUCTION
* create the department-level averages by gender
* For every measure, <measure>_g holds the mean within the
* university x gender x discipline cell.
* NOTE(review): the means are computed on all rows in memory; the
* pos_class / citations_all filter is only applied afterwards, inside the
* preserve blocks. Confirm that this is the intended sample.
local measures i10_index_all citations_all citations_1 averagecitations ///
	i10_index_6y citations_6y averagecitations6y
foreach measure of local measures {
	by uni gender discipline, sort: egen `measure'_g = mean(`measure')
}

* male and female outcomes
* For each gender (m: gender == 0, f: gender == 1) keep the scholars with a
* position and non-missing total citations, reduce the data to one row per
* university x discipline holding the gender-specific means, rename the
* means from *_g to *_m / *_f and store the result in a temporary file.
local measures i10_index_all citations_all citations_1 averagecitations ///
	i10_index_6y citations_6y averagecitations6y
local dept_means
foreach stub of local measures {
	local dept_means `dept_means' `stub'_g
}
local gcode_m 0
local gcode_f 1
foreach sex in m f {
	preserve
		drop if pos_class == "" | citations_all == .
		keep if gender == `gcode_`sex''
		collapse (mean) `dept_means', by(uni discipline)
		foreach stub of local measures {
			rename `stub'_g `stub'_`sex'
		}
		save "$path\temp\temp_`sex'.dta", replace
	restore
}

* female proportion and controls
* One row per university x discipline with
*   female   : mean of gender (share of women)
*   assprop  : share of assistant professors
*   distprop : mean of distinguished_dummy
*   country_1: mean of the encoded country
preserve
	drop if citations_all == . | pos_class == ""
	generate assprop = (pos_class == "Assistant Professor")
	collapse (mean) female = gender assprop distprop = distinguished_dummy country_1, ///
		by(uni discipline)
	save "$path\temp\temp_g.dta", replace
restore

* merge all
* Combine the male means, the female means and the department controls, then
* build the gender gap of every measure as female mean / male mean * 100.
preserve
	use "$path\temp\temp_m.dta", clear
	merge 1:1 uni discipline using "$path\temp\temp_f.dta", nogenerate
	merge 1:1 uni discipline using "$path\temp\temp_g.dta"

	* departments whose gap is not computed (set to missing)
	generate byte gap_excluded = (uni == "Zhejiang University" & discipline == "econ") ///
		| (inlist(uni, "Fudan University", "Zhejiang University") & discipline == "poli")
	foreach measure in i10_index_all citations_all citations_1 averagecitations ///
		i10_index_6y citations_6y averagecitations6y {
		generate `measure'_gap = `measure'_f / `measure'_m * 100 if !gap_excluded
	}
	drop gap_excluded

	* correct misleading identification of female researchers
	* NOTE(review): Zhejiang University / poli has its gaps blanked above but its
	* female share is not reset here - confirm this is intended.
	replace female = 0 if (uni == "Zhejiang University" & discipline == "econ") ///
		| (uni == "Fudan University" & discipline == "poli")
	save "$path\temp\data_uni_fin.dta", replace
restore

** REGRESSIONS AND CHARTS
* Load the department-level file saved by the merge step
use "$path\temp\data_uni_fin.dta", clear

* numeric (value-labelled) discipline, needed for the i. factor notation in the regressions
encode discipline, generate(disc_num)

** FIGURE 1
* i10-index
* Scatter of the gap against the share of women, one marker style per
* discipline, plus a lowess smoother fitted on both disciplines together.
local gap i10_index_all_gap
twoway ///
	(scatter `gap' female if discipline == "econ", msymbol(circle) mcolor(blue)) ///
	(scatter `gap' female if discipline == "poli", msymbol(diamond) mcolor(green)) ///
	(lowess `gap' female, lcolor(black)), ///
	ytitle("gender gap (inverse) in number of publications") ///
	xtitle("proportion of women") ///
	legend(order(1 "Economics" 2 "Political science"))
graph export "$path\output\uni_i10_index_lowess.jpg", replace as(jpg)

** FIGURE A1
* past 6 years
* Same layout as the all-time i10-index figure, using the 6-year i10-index gap.
local gap i10_index_6y_gap
twoway ///
	(scatter `gap' female if discipline == "econ", msymbol(circle) mcolor(blue)) ///
	(scatter `gap' female if discipline == "poli", msymbol(diamond) mcolor(green)) ///
	(lowess `gap' female, lcolor(black)), ///
	ytitle("gender gap (inverse) in number of publications") ///
	xtitle("proportion of women") ///
	legend(order(1 "Economics" 2 "Political science"))
graph export "$path\output\uni_i10_index_lowess_6y.jpg", replace as(jpg)

** FIGURE 2
* citations
* Scatter of the total-citations gap against the share of women, one marker
* style per discipline, plus a lowess smoother fitted on both disciplines.
twoway (scatter citations_all_gap female if discipline=="econ", mcolor(blue) msymbol(circle)) ///
	(scatter citations_all_gap female if discipline=="poli", mcolor(green) msymbol(diamond)) ///
	(lowess citations_all_gap female, lcolor(black)), ///
	legend(order(1 "Economics" 2 "Political science")) ytitle("gender gap (inverse) in total citations") xtitle("proportion of women") 
* Export to the output folder under the global path (the global path_output is
* never defined in this file, so referring to it would drop the folder from the file name).
graph export "$path\output\uni_citations_lowess.pdf", as(pdf) replace
graph export "$path\output\uni_citations_lowess.jpg", as(jpg) replace

** FIGURE A2
* past 6 years
* Same layout as the total-citations figure, using the 6-year citations gap.
twoway (scatter citations_6y_gap female if discipline=="econ", mcolor(blue) msymbol(circle)) ///
	(scatter citations_6y_gap female if discipline=="poli", mcolor(green) msymbol(diamond)) ///
	(lowess citations_6y_gap female, lcolor(black)), ///
	legend(order(1 "Economics" 2 "Political science")) ytitle("gender gap (inverse) in total citations") xtitle("proportion of women") 
* Export to the output folder under the global path (the global path_output is
* never defined in this file, so referring to it would drop the folder from the file name).
graph export "$path\output\uni_citations_lowess_6y.jpg", as(jpg) replace

** FIGURE 3
* citations of most cited publication
* Scatter of the gap against the share of women, one marker style per
* discipline, plus a lowess smoother fitted on both disciplines together.
twoway (scatter citations_1_gap female if discipline=="econ", mcolor(blue) msymbol(circle)) ///
	(scatter citations_1_gap female if discipline=="poli", mcolor(green) msymbol(diamond)) ///
	(lowess citations_1_gap female, lcolor(black)), ///
	legend(order(1 "Economics" 2 "Political science")) ytitle("gender gap (inverse) in citations of most cited publication") xtitle("proportion of women") 
* Export to the output folder under the global path (the global path_output is
* never defined in this file, so referring to it would drop the folder from the file name).
graph export "$path\output\uni_citations_1_lowess.jpg", as(jpg) replace

** FIGURE 4
* citations per publication
* Scatter of the gap against the share of women, one marker style per
* discipline, plus a lowess smoother fitted on both disciplines together.
* (y-axis title: spelling of "publication" corrected)
twoway (scatter averagecitations_gap female if discipline=="econ", mcolor(blue) msymbol(circle)) ///
	(scatter averagecitations_gap female if discipline=="poli", mcolor(green) msymbol(diamond)) ///
	(lowess averagecitations_gap female, lcolor(black)), ///
	legend(order(1 "Economics" 2 "Political science")) ytitle("gender gap (inverse) in citations per publication") xtitle("proportion of women") 
* Export to the output folder under the global path (the global path_output is
* never defined in this file, so referring to it would drop the folder from the file name).
graph export "$path\output\uni_averagecitations_lowess.jpg", as(jpg) replace

** FIGURE A3
* past 6 years
* Same layout as the citations-per-publication figure, using the 6-year gap.
* NOTE(review): Technical University of Munich is left out of the economics
* scatter only; the lowess line below is still fitted with it. Confirm that
* this asymmetry is intended (presumably an outlier hidden for readability).
twoway (scatter averagecitations6y_gap female if uni != "Technical University of Munich" & discipline=="econ", mcolor(blue) msymbol(circle)) ///
	(scatter averagecitations6y_gap female if discipline=="poli", mcolor(green) msymbol(diamond)) ///
	(lowess averagecitations6y_gap female, lcolor(black)), ///
	legend(order(1 "Economics" 2 "Political science")) ytitle("gender gap (inverse) in citations per publication") xtitle("proportion of women") 
* Export to the output folder under the global path (the global path_output is
* never defined in this file, so referring to it would drop the folder from the file name).
graph export "$path\output\uni_averagecitations_lowess_6y.pdf", as(pdf) replace
graph export "$path\output\uni_averagecitations_lowess_6y.jpg", as(jpg) replace

** TABLE 2
* For each all-time gap measure: regress the gap on the share of women fully
* interacted with discipline, with robust standard errors, first without and
* then with controls (country indicators, share of assistant professors, share
* of distinguished professors). After each regression, report the marginal
* effect of the share of women by discipline.
local controls i.country_1 assprop distprop
foreach outcome in i10_index_all citations_all citations_1 averagecitations {
	* no controls
	regress `outcome'_gap c.female##i.disc_num, vce(robust)
	margins disc_num, dydx(female)

	* with controls
	regress `outcome'_gap c.female##i.disc_num `controls', vce(robust)
	margins disc_num, dydx(female)
}
